# ---- Setup ----
library(tidyverse)
library(ggplot2)
library(dplyr)
library(leaflet)
library(usmap)
library(sf)
library(readr)
library(scales)
library(tidyr)
library(lubridate)
library(pheatmap)

# ---- Read in the raw data for Warnings and Citations ----
# Dates are parsed as m/d/Y; personally identifying columns
# (web address, phone number, name) are skipped on read.
warnings <- read_csv(
  "2023_warning_data.csv",
  col_types = cols(
    Warnings_Date = col_date(format = "%m/%d/%Y"),
    WEB_ADDRESS = col_skip(),
    PHONE_NUMBER = col_skip(),
    NAME = col_skip()
  )
)
citations <- read_csv(
  "2023_citation_data.csv",
  col_types = cols(
    Date = col_date(format = "%m/%d/%Y"),
    WEB_ADDRESS = col_skip(),
    PHONE_NUMBER = col_skip(),
    NAME = col_skip()
  )
)

# ---- Harmonize column names across the two sources ----
citations <- citations %>%
  rename(ViolationDate = Date)
warnings <- warnings %>%
  rename(Sex = Gender, ViolationDate = Warnings_Date)

# Map the single-letter Sex code onto a readable Gender label;
# anything other than "M"/"F" (including NA) becomes "Other/Unknown".
recode_sex <- function(sex) {
  case_when(
    sex == "M" ~ "Male",
    sex == "F" ~ "Female",
    TRUE ~ "Other/Unknown"
  )
}

# ---- Prepare citations for the merge ----
# Assumption that ID is the officer's ID
citations_processed <- citations %>%
  mutate(
    outcome = "Citation",
    Gender = recode_sex(Sex),
    Year = year(ViolationDate),
    Month = month(ViolationDate),
    DayOfMonth = day(ViolationDate),
    Time = parse_date_time(Time, "HM"),
    data_type = "Citation"
  ) %>%
  select(
    outcome, Gender, Year, Month, DayOfMonth, ViolationDate, Time,
    Offense_Description = Charge,
    District = DISTRICT,
    Race, Ethnicity, Latitude, Longitude,
    OfficerID = ID,
    data_type
  )

# ---- Prepare warnings for the merge ----
warnings_processed <- warnings %>%
  mutate(
    outcome = "Warning",
    Gender = recode_sex(Sex),
    Year = year(ViolationDate),
    Month = month(ViolationDate),
    DayOfMonth = day(ViolationDate),
    Time = parse_date_time(Time, "HM"),
    data_type = "Warning"
  ) %>%
  select(
    outcome, Gender, Year, Month, DayOfMonth, ViolationDate, Time,
    Offense_Description,
    District = DISTRICT,
    Race, Ethnicity,
    Latitude = Lat,
    Longitude = Long,
    OfficerID = Officer_ID,
    data_type
  )

# ---- Combine both sources into one data set ----
combined_wc <- bind_rows(citations_processed, warnings_processed)
# ---- Binary outcome: 0 = Citation, 1 = Warning (got out of the ticket) ----
combined_wc <- combined_wc %>%
  mutate(BinaryOutcome = ifelse(outcome == "Warning", 1, 0))

## Title Case for district names (raw data is inconsistent on case)
combined_wc$District <- tools::toTitleCase(tolower(combined_wc$District))

## Examining Unverified data
## After examination, unverified only makes up 0.0143 or 1.43% of the data
## set, so we will remove it because it is a very small portion of the total
## proportion.
# combined_wc %>%
#   count(District) %>%
#   mutate(Proportion = n / sum(n)) %>%
#   arrange(desc(n))

## Drop Unverified and NA districts. (The != filter already drops NA rows
## since NA comparisons are NA; the is.na() filter documents the intent.)
combined_wc <- combined_wc %>%
  filter(District != "Unverified") %>%
  filter(!is.na(District))

## Restrict to Male/Female for the gender comparisons
combined_wc_mf <- combined_wc %>%
  filter(Gender != "Other/Unknown")

## ---- Stacked bar chart: outcome counts by gender ----
ggplot(combined_wc_mf, aes(x = outcome, fill = Gender)) +
  geom_bar() +
  labs(
    title = "Count of Outcomes by Gender",
    x = "Outcome Type",
    y = "# of Incident",
    fill = "Gender"
  ) +
  theme_gray() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = c("Female" = "lightpink2", "Male" = "lightsteelblue")
  )
## ---- Gender chart: Warning rate by gender ----
## Warning rate = proportion of stops in each gender group that ended in a
## Warning rather than a Citation (the mean of the 0/1 BinaryOutcome).
gender_warning_rate <- combined_wc_mf %>%
  group_by(Gender) %>%
  summarise(
    Total_Incidents = n(),
    Warning_Rate = mean(BinaryOutcome)
  ) %>%
  ungroup()

gender_chart <- ggplot(
  gender_warning_rate,
  aes(x = Gender, y = Warning_Rate, fill = Gender)
) +
  geom_col(show.legend = FALSE) +
  # Percentage label just above each bar
  geom_text(
    aes(label = scales::percent(Warning_Rate, accuracy = 0.1)),
    vjust = -0.5,
    size = 5
  ) +
  # Leave 10% headroom so the text labels are not clipped
  scale_y_continuous(
    labels = scales::percent,
    limits = c(0, max(gender_warning_rate$Warning_Rate) * 1.1)
  ) +
  labs(
    title = "Warning Rate by Gender",
    subtitle = "Proportion of stops resulting in a Warning (vs Citation)",
    x = "Gender",
    y = "Warning Rate"
  ) +
  theme_gray() +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  scale_fill_manual(
    values = c("Female" = "lightpink2", "Male" = "lightsteelblue")
  )
gender_chart
## ---- Chi-squared test of independence: Gender vs outcome ----
## Build the 2x2 contingency table (Gender x BinaryOutcome) and test whether
## the warning/citation split differs between male and female drivers.
contingency_tbl <- with(
  subset(combined_wc_mf, Gender %in% c("Male", "Female")),
  table(Gender, BinaryOutcome)
)

chi_sq_results <- chisq.test(contingency_tbl)

# Expected cell counts under independence (useful for checking that the
# chi-squared approximation is appropriate)
expected_counts <- chi_sq_results$expected
#> `summarise()` has grouped output by 'DayOfMonth'. You can override using the
#> `.groups` argument.
## ---- Interactive heatmap: citation rate by day of month and hour ----
## Fix: the original built `all_hours` but never passed it to factor(), so
## the factors only kept the hours/days actually present in the data and
## `drop = FALSE` had nothing to restore. Supplying the full level sets
## guarantees a complete 24-hour x 31-day axis grid.
all_hours <- as.character(0:23)
all_days <- as.character(1:31)

# Convert to factors with the full level sets so both axes are complete
heatmap_days$Hour <- factor(heatmap_days$Hour, levels = all_hours)
heatmap_days$DayOfMonth <- factor(heatmap_days$DayOfMonth, levels = all_days)

# Hover text for the interactive (plotly) version
heatmap_days <- heatmap_days %>%
  mutate(
    text = paste0(
      "Hour: ", Hour, "\n",
      "Day: ", DayOfMonth, "\n",
      "Citation Rate: ", round(Citation_Rate, 2)
    )
  )

plot_heatmap_discrete_axes <- ggplot(
  heatmap_days,
  aes(x = Hour, y = DayOfMonth, fill = Citation_Rate, text = text)
) +
  geom_tile(aes(width = 1, height = 1), color = "white", linewidth = 0.1) +
  # Fix: the original called scale_fill_viridis() from the viridis package,
  # which is never loaded in this file; scale_fill_viridis_c() ships with
  # ggplot2 and produces the same continuous magma fill.
  scale_fill_viridis_c(
    option = "magma",
    direction = -1,
    name = "Citation Rate",
    labels = scales::percent
  ) +
  scale_x_discrete(
    name = "Hour of Day (0 = Midnight, 12 = Noon)",
    drop = FALSE
  ) +
  scale_y_discrete(drop = FALSE) +
  labs(
    title = "Citation Rate by Day of Month and Hour of Day",
    subtitle = "Higher intensity (darker color) indicates a higher proportion of Citations",
    y = "Day of the Month"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(vjust = 0.1, hjust = 0.1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

# ggplotly() lives in the plotly package, which is not loaded at the top of
# the file; load it here so the interactive render works.
library(plotly)
ggplotly(plot_heatmap_discrete_axes, tooltip = "text")
## ---- Static version of the citation-rate heatmap ----
## NOTE(review): this duplicates the interactive chunk's plot construction;
## it is kept so the static figure renders even without plotly.
plot_heatmap_discrete_axes <- ggplot(
  heatmap_days,
  aes(x = Hour, y = DayOfMonth, fill = Citation_Rate, text = text)
) +
  geom_tile(aes(width = 1, height = 1), color = "white", linewidth = 0.1) +
  # Fix: the original called scale_fill_viridis() from the viridis package,
  # which is never loaded in this file; scale_fill_viridis_c() ships with
  # ggplot2 and produces the same continuous magma fill.
  scale_fill_viridis_c(
    option = "magma",
    direction = -1,
    name = "Citation Rate",
    labels = scales::percent
  ) +
  scale_x_discrete(
    name = "Hour of Day (0 = Midnight, 12 = Noon)",
    drop = FALSE
  ) +
  scale_y_discrete(drop = FALSE) +
  labs(
    title = "Citation Rate by Day of Month and Hour of Day",
    subtitle = "Higher intensity (darker color) indicates a higher proportion of Citations",
    y = "Day of the Month"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(vjust = 0.1, hjust = 0.1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )
plot_heatmap_discrete_axes
# ---- 2023 arrest data ----
Arrest2023 <- read_csv(
  "2023_arrest_data.csv",
  col_types = cols(
    ArresteeNumber = col_skip(),
    ArrestDate = col_date(format = "%m/%d/%Y"),
    ArrestTime = col_character(),
    WEB_ADDRESS = col_skip(),
    PHONE_NUMBER = col_skip(),
    NAME = col_skip()
  )
)

## Remove any outliers by setting min/max lat and long.
min_lat <- 38.6
max_lat <- 39.0
min_lon <- -77.6
max_lon <- -77.0

## Keep every row whose coordinates fall inside the bounding box (inclusive
## on both ends, matching the original >= / <= comparisons)
arrest_data_clean <- Arrest2023 %>%
  filter(
    between(Latitude, min_lat, max_lat),
    between(Longitude, min_lon, max_lon)
  )

## District boundaries from the shapefile downloaded from the Fairfax County
## site
district_boundaries <- st_read("Supervisor_Districts/Supervisor_Districts.shp")
#> Reading layer `Supervisor_Districts' from data source
#>   `C:\Users\Nat\OneDrive - George Mason University - O365 Production\Git\nathaniams.github.io\Supervisor_Districts\Supervisor_Districts.shp'
#>   using driver `ESRI Shapefile'
#> Simple feature collection with 9 features and 12 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: 11757190 ymin: 6905421 xmax: 11899000 ymax: 7070364
#> Projected CRS: NAD83 / Virginia North (ftUS)
## Transform the district polygons to WGS84 (EPSG:4326) so they line up with
## leaflet's basemap.
## Fix: the original tested `st_crs(...)$epsg != 4326`; when a CRS carries no
## EPSG code that subscript is NA, and `if (NA)` throws an error. Comparing
## the crs objects directly (sf defines ==/!= for them) handles that case.
if (st_crs(district_boundaries) != st_crs(4326)) {
  district_boundaries <- st_transform(district_boundaries, 4326)
}
library(nnet)

## Multinomial logistic regression: model the stop outcome from demographics
## and district. multinom() expects categorical predictors, so convert the
## relevant columns to factors first.
factor_cols <- c("Outcome", "Race", "Gender", "District", "Ethnicity")
combined_data[factor_cols] <- lapply(combined_data[factor_cols], factor)

model <- multinom(
  Outcome ~ Race + Gender + District + Ethnicity,
  data = combined_data
)
#> # weights: 69 (44 variable)
#> initial value 133797.793412
#> iter 10 value 110273.346261
#> iter 20 value 109828.994978
#> iter 30 value 108047.740898
#> iter 40 value 106985.637011
#> iter 50 value 106983.922418
#> final value 106983.911647
#> converged